In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler,StandardScaler,normalize
import matplotlib.pyplot as plt 
#importing the header files 
from collections import Counter

from sklearn.manifold import TSNE
import time
import sklearn
from sklearn.decomposition import PCA 

from sklearn import mixture


from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV

import plotly.express as px
from collections import Counter 
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import umap
from audio_results_util import load_dataset,plotClusters,encodedLabels,plotData3d,plotData2d,plotAllClusterModels,actualDistribution
%matplotlib inline
In [4]:
# Load the vowel formant (LPC) features together with the string labels and
# their integer-encoded counterparts, as returned by load_dataset.
train_data, true_labels, true_encoded_labels = load_dataset(
    'Audio_features_vowel/formantsLpc_muslim_features_vowel.csv'
)
In [5]:
# Show the dimensions of the loaded feature data.
train_data.shape
Out[5]:
(240, 3)
In [6]:
# Read the saved clustering metrics for the untransformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_muslim_features_vowel.csv'
)
results_df
Out[6]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.049107 0.033785 0.077170 0.052793 0.062695 0.380333 0.316667 0.170712 0.193956 0.326590 153.247194 0.951929
1 Agglomerative clustering 2 0.001517 -0.000223 0.009134 0.024759 0.013344 0.613968 0.587500 0.191485 0.140000 0.494066 142.037178 0.663303
2 Birch 2 0.023662 -0.005365 0.005049 0.010304 0.006777 0.582460 0.550000 0.194291 0.143750 0.463457 172.287907 0.830984
3 DBSCAN 5 0.067154 0.011672 0.050904 0.054873 0.052814 0.538391 0.491667 0.135462 0.143849 0.184679 22.170117 1.694045
4 Mean-shift 2 0.019027 0.006378 0.011238 0.098177 0.020167 0.684363 0.650000 0.196226 0.133333 0.465083 27.744034 0.802514
5 Optics 2 -0.028867 0.000363 0.018190 0.034640 0.023853 0.575271 0.041667 0.022254 0.155747 -0.151371 66.696111 1.953891
6 Gaussian-mixture 5 0.072207 0.042929 0.081175 0.070010 0.075180 0.482986 0.433333 0.160555 0.161822 0.255497 90.341606 1.237190
In [7]:
# First show how the true labels are distributed over the raw features ...
actualDistribution(
    train_data,
    true_labels,
    true_encoded_labels,
)
# ... then draw every saved clustering result on the same raw features.
plotAllClusterModels(
    train_data,
    'audio-results4/formantsLpc_muslim_features_vowel.csv',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formantsLpc_muslim_features_vowel.csv_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 91, 2: 58, 3: 34, 1: 31, 4: 26})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 210, 1: 30})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 168, -1: 47, 4: 8, 2: 6, 1: 6, 3: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 201, 1: 29, 0: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 234, 1: 6})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({0: 144, 3: 41, 4: 24, 1: 21, 2: 10})
 2D representation
 3D representation
In [ ]:
 

PCA transformed data

In [ ]:
 
In [8]:
# Read the saved clustering metrics for the PCA-transformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_muslim_features_vowel.csv-pca.csv'
)
results_df
Out[8]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.049107 0.033785 0.077170 0.052793 0.062695 0.380333 0.358333 0.265884 0.257903 0.326590 153.247194 0.951929
1 Agglomerative clustering 2 0.001517 -0.000223 0.009134 0.024759 0.013344 0.613968 0.587500 0.191485 0.140000 0.494066 142.037178 0.663303
2 Birch 2 0.023662 -0.005365 0.005049 0.010304 0.006777 0.582460 0.550000 0.194291 0.143750 0.463457 172.287907 0.830984
3 DBSCAN 5 0.067154 0.011672 0.050904 0.054873 0.052814 0.538391 0.491667 0.135462 0.143849 0.184679 22.170117 1.694045
4 Mean-shift 2 0.019027 0.006378 0.011238 0.098177 0.020167 0.684363 0.650000 0.196226 0.133333 0.465083 27.744034 0.802514
5 Optics 2 -0.028867 0.000363 0.018190 0.034640 0.023853 0.575271 0.041667 0.022254 0.155747 -0.151371 66.696111 1.953891
6 Gaussian-mixture 5 0.011818 0.044626 0.089590 0.061674 0.073056 0.351785 0.225000 0.167672 0.164909 0.231068 114.324735 1.132191
In [9]:
# Project the features onto three principal components and wrap the result
# in a DataFrame before handing it to the plotting helpers.
pca_transformed = pd.DataFrame(
    PCA(n_components=3).fit_transform(train_data)
)

# True label distribution, followed by every saved clustering result, both
# drawn on the PCA projection.
actualDistribution(
    pca_transformed,
    true_labels,
    true_encoded_labels,
)
plotAllClusterModels(
    pca_transformed,
    'audio-results4/formantsLpc_muslim_features_vowel.csv-pca',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formantsLpc_muslim_features_vowel.csv-pca_kmeans_labels.npy
K-Means
predicted_labels--> Counter({0: 91, 2: 58, 1: 34, 4: 31, 3: 26})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 210, 1: 30})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 192, 1: 48})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({0: 168, -1: 47, 4: 8, 2: 6, 1: 6, 3: 5})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 201, 1: 29, 0: 10})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 234, 1: 6})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({1: 79, 0: 70, 4: 44, 2: 30, 3: 17})
 2D representation
 3D representation
In [ ]:
 

t-SNE transformed data

In [10]:
# Read the saved clustering metrics for the t-SNE-transformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_muslim_features_vowel.csv-tsne.csv'
)
results_df
Out[10]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.023734 6.578346e-02 0.118764 0.075576 0.092371 0.328058 0.141667 0.154505 0.154306 0.317404 133.420679 0.975012
1 Agglomerative clustering 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.004167 0.000026 0.004167 -1.000000 -1.000000 -1.000000
2 Birch 240 0.000000 -6.363593e-14 1.000000 0.186350 0.314157 0.000000 0.000000 0.000000 0.000000 -1.000000 -1.000000 -1.000000
3 DBSCAN 3 0.083096 5.747927e-02 0.082219 0.084475 0.083332 0.514053 0.475000 0.190131 0.166824 0.035523 20.666571 3.899713
4 Mean-shift 1 0.000000 -5.797594e-17 0.000000 1.000000 0.000000 0.694318 0.662500 0.200000 0.132500 -1.000000 -1.000000 -1.000000
5 Optics 1 0.069242 3.906352e-02 0.033263 0.132812 0.053201 0.674273 0.033333 0.008386 0.078431 0.159561 28.366806 0.985748
6 Gaussian-mixture 5 0.031595 7.361796e-02 0.128434 0.081902 0.100021 0.335200 0.087500 0.066658 0.102477 0.316204 133.159943 0.984135
In [11]:
# Embed the features in three dimensions with t-SNE. t-SNE is stochastic, so
# the seed is fixed to make the embedding reproducible across kernel restarts;
# 42 matches the seed used for the UMAP embedding in this notebook.
# NOTE(review): the saved cluster labels were presumably computed on a separate
# t-SNE run -- confirm which seed (if any) that run used.
tsne_transformed = TSNE(n_components=3, n_jobs=-1, random_state=42).fit_transform(train_data)
tsne_transformed = pd.DataFrame(tsne_transformed)
actualDistribution(tsne_transformed, true_labels, true_encoded_labels)

# Draw the saved t-SNE clustering results on the t-SNE embedding itself.
# Previously the untransformed train_data was passed here, unlike the PCA and
# UMAP sections, which pass their transformed frames.
plotAllClusterModels(tsne_transformed, 'audio-results4/formantsLpc_muslim_features_vowel.csv-tsne')
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formantsLpc_muslim_features_vowel.csv-tsne_kmeans_labels.npy
K-Means
predicted_labels--> Counter({1: 52, 2: 52, 3: 51, 0: 43, 4: 42})
 2D representation
 3D representation
Agglomerative
Too many labels to show
Birch
Too many labels to show
DBSCAN
predicted_labels--> Counter({0: 151, -1: 56, 2: 21, 1: 12})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 223, 0: 17})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 240})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({3: 55, 1: 51, 4: 51, 2: 45, 0: 38})
 2D representation
 3D representation
In [ ]:
 

UMAP transformed data

In [12]:
# Read the saved clustering metrics for the UMAP-transformed features and
# display the table as the cell output.
results_df = pd.read_csv(
    'audio-results4/formantsLpc_muslim_features_vowel.csv-umap.csv'
)
results_df
Out[12]:
Unnamed: 0 n_clusters ARI AMI H C V FM A R P silhouette calinski davies
0 K-Means 5 0.010495 0.029110 0.072641 0.046922 0.057016 0.324422 0.154167 0.250250 0.197038 0.514070 677.345242 0.710064
1 Agglomerative clustering 2 0.001517 -0.000223 0.009134 0.024759 0.013344 0.613968 0.587500 0.191485 0.140000 0.694544 414.206434 0.307543
2 Birch 11 0.009812 0.045064 0.162791 0.071003 0.098879 0.228220 0.120833 0.048673 0.080128 0.582996 990.221919 0.581586
3 DBSCAN 19 -0.009454 0.037972 0.217559 0.085119 0.122364 0.227683 0.037500 0.003580 0.046364 0.210222 99.856370 1.094046
4 Mean-shift 2 0.001517 -0.000223 0.009134 0.024759 0.013344 0.613968 0.587500 0.191485 0.140000 0.694544 414.206434 0.307543
5 Optics 2 0.021125 0.007225 0.024807 0.027400 0.026039 0.475171 0.175000 0.055797 0.107546 0.518518 553.795764 0.588591
6 Gaussian-mixture 5 0.018025 0.022490 0.064302 0.041974 0.050793 0.335072 0.133333 0.097619 0.174703 0.483828 607.227547 0.772536
In [13]:
# Build a seeded three-component UMAP embedding of the features and wrap the
# result in a DataFrame before handing it to the plotting helpers.
umap_transformed = pd.DataFrame(
    umap.UMAP(random_state=42, n_components=3).fit_transform(train_data)
)

# True label distribution, followed by every saved clustering result, both
# drawn on the UMAP embedding.
actualDistribution(
    umap_transformed,
    true_labels,
    true_encoded_labels,
)
plotAllClusterModels(
    umap_transformed,
    'audio-results4/formantsLpc_muslim_features_vowel.csv-umap',
)
Actual Distribution of the labels in the dataset: --> 
Counter({'a': 159, 'u': 47, 'misaligned': 13, 'cantdecide': 13, 'other': 8})
Counter({0: 159, 4: 47, 2: 13, 1: 13, 3: 8})
audio-results4/formantsLpc_muslim_features_vowel.csv-umap_kmeans_labels.npy
K-Means
predicted_labels--> Counter({2: 63, 4: 55, 3: 49, 1: 43, 0: 30})
 2D representation
 3D representation
Agglomerative
predicted_labels--> Counter({0: 210, 1: 30})
 2D representation
 3D representation
Birch
predicted_labels--> Counter({0: 39, 1: 30, 2: 25, 4: 24, 5: 23, 8: 23, 9: 19, 3: 15, 7: 15, 6: 14, 10: 13})
 2D representation
 3D representation
DBSCAN
predicted_labels--> Counter({-1: 72, 14: 19, 1: 14, 3: 13, 2: 12, 0: 11, 6: 11, 5: 10, 7: 10, 16: 9, 12: 8, 9: 8, 11: 7, 10: 7, 8: 5, 4: 5, 15: 5, 13: 5, 18: 5, 17: 4})
 2D representation
 3D representation
OPTICS
predicted_labels--> Counter({-1: 143, 0: 67, 1: 30})
 2D representation
 3D representation
MEAN-SHIFT
predicted_labels--> Counter({0: 210, 1: 30})
 2D representation
 3D representation
Gaussian-Mixture
predicted_labels--> Counter({3: 65, 2: 62, 4: 49, 0: 34, 1: 30})
 2D representation
 3D representation
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: